library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(reshape2)
ny_house_data <- read_csv("NY-House-Dataset.csv", show_col_types = FALSE)
# For simplicity, remove rows with missing values
ny_house_data <- ny_house_data %>% na.omit()

# Convert relevant columns to numeric
ny_house_data$price <- as.numeric(ny_house_data$PRICE)
ny_house_data$beds <- as.numeric(ny_house_data$BEDS)
ny_house_data$bath <- as.numeric(ny_house_data$BATH)
ny_house_data$propertysqft <- as.numeric(ny_house_data$PROPERTYSQFT)

# Calculate the first quartile (Q1), third quartile (Q3), and IQR
Q1 <- quantile(ny_house_data$PRICE, 0.25)
Q3 <- quantile(ny_house_data$PRICE, 0.75)
IQR <- Q3 - Q1

# Define the lower and upper bounds for outliers
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR

# Filter out rows with `PRICE` values outside of the bounds
ny_house_data <- ny_house_data %>%
  filter(PRICE >= lower_bound & PRICE <= upper_bound)
#Extracting zip code from column STATE, then store in new column ZIP_CODE
library(stringr)
ny_house_data$ZIP_CODE <- str_extract(ny_house_data$STATE, "\\d{5}$")

#Add a new column, price per sqft
ny_house_data$PRICE_PER_SQFT<-ny_house_data$PRICE/ny_house_data$PROPERTYSQFT

# Group the data frame by ZIP_CODE and calculate the median price
median_prices_per_sqft <- ny_house_data %>%
  group_by(ZIP_CODE) %>%
  summarize(median_price = median(PRICE/PROPERTYSQFT, na.rm = TRUE))

median_prices_per_sqft$ZIP_CODE<-as.integer(median_prices_per_sqft$ZIP_CODE)
#Import Zip Code Boundaries file for New York City
ny_zipcode_shape<-read.csv("Modified_Zip_Code_Tabulation_Areas__MODZCTA_.csv")
zip_codes_with_prices <- left_join(ny_zipcode_shape,median_prices_per_sqft,by=c("MODZCTA"="ZIP_CODE"))

# Remove rows with missing values
zip_codes_with_prices <- zip_codes_with_prices[complete.cases(zip_codes_with_prices), ]

# Drop unnecessary columns by name
columns_to_drop <- c("label", "ZCTA","pop_est")
zip_codes_with_prices <- zip_codes_with_prices[, !(names(zip_codes_with_prices) %in% columns_to_drop)]

zip_codes_with_prices$MODZCTA<-as.numeric(zip_codes_with_prices$MODZCTA)
ny_house_data_map<-data.frame(
  FORMATTED_ADDRESS=ny_house_data$FORMATTED_ADDRESS,
  TYPE=ny_house_data$TYPE,
  TOTAL_PRICE=ny_house_data$PRICE,
  PRICE_PER_SQFT=ny_house_data$PRICE_PER_SQFT,
  LONGITUDE=ny_house_data$LONGITUDE,
  LATITUDE=ny_house_data$LATITUDE
)
# Calculate the correlation matrix
correlation_matrix <- ny_house_data %>%
  select(PROPERTYSQFT, BEDS, BATH, PRICE) %>%
  cor(use = "pairwise.complete.obs")

# Melt the correlation matrix for plotting
melted_correlation_matrix <- reshape2::melt(correlation_matrix)

# Plot the heatmap with blue color scheme, improved appearance, centered title, and bold headline
ggplot(melted_correlation_matrix, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white", size = 0.2) +  # Add white borders to tiles
  scale_fill_gradient2(low = "#d9f0ff", high = "#4287f5", mid = "white", midpoint = 0, limit = c(-1, 1), space = "Lab", name = "Correlation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),  # Rotate x-axis labels
        plot.title = element_text(hjust = 0.5, face = "bold")) +  # Center plot title and make it bold
  labs(x = "", y = "") +  # Remove axis labels
  ggtitle(bquote(bold("Correlation Heatmap")))  # Set the centered and bold title
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Calculate average price by house type and sort in increasing order
average_price_by_type <- ny_house_data %>%
  group_by(TYPE) %>%
  summarize(AveragePrice = mean(PRICE, na.rm = TRUE)) %>%
  arrange(AveragePrice)

# Calculate incremental changes in average price
average_price_by_type <- average_price_by_type %>%
  mutate(Incremental = AveragePrice - lag(AveragePrice, default = first(AveragePrice)))

# Create a waterfall chart using ggplot2 with wider bars
ggplot(average_price_by_type, aes(x = reorder(TYPE, AveragePrice), y = Incremental, fill = Incremental > 0)) +
  geom_rect(aes(x = TYPE, xmin = as.numeric(as.factor(TYPE)) - 0.4, xmax = as.numeric(as.factor(TYPE)) + 0.4, ymin = lag(cumsum(Incremental), default = 0), ymax = cumsum(Incremental)), 
            color = "black", fill = ifelse(average_price_by_type$Incremental > 0, "#4287f5", "#d9f0ff")) +
  labs(
    title = "Waterfall Chart of Average Price Contributions by House Type",
    x = "House Type",
    y = "Incremental Average Price"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5, face = "bold"),
        axis.title.y = element_text(margin = margin(t = 0, r = 10, b = 0, l = 0))) +
  scale_fill_manual(values = c("#4287f5", "#d9f0ff"), guide = "none") +  # Blue color scheme
  ggtitle("Waterfall Chart of Average Price Contributions by House Type")  # Centered and bold title
## Warning in geom_rect(aes(x = TYPE, xmin = as.numeric(as.factor(TYPE)) - :
## Ignoring unknown aesthetics: x

# Create a boxplot of house prices by house type with a blue color palette
ggplot(ny_house_data, aes(x = TYPE, y = PRICE, fill = TYPE)) +
  geom_boxplot() +
  labs(
    title = "House Prices by House Type",
    x = "House Type",
    y = "Price"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5, face = "bold"),
        legend.position = "none") +
  scale_fill_manual(values = c("#4287f5","#4287f5","#4287f5","#4287f5","#4287f5","#4287f5","#4287f5","#4287f5","#4287f5","#4287f5","#4287f5","#4287f5","#4287f5")) +  # All blue colors
  ggtitle("House Prices by House Type")  # Centered and bold title

library(leaflet)
# Create the Leaflet map
p1 <- leaflet(ny_house_data_map) %>%
  addTiles() %>%    
  addCircleMarkers(lng = ~ny_house_data_map$LONGITUDE, lat = ~ny_house_data_map$LATITUDE,
             popup = ~paste("<b>Address:</b>", ny_house_data$FORMATTED_ADDRESS,"<br>",
                            "<b>Establishment Type:</b> ", ny_house_data$TYPE,"<br>",
                            "<b>Price per sqft:</b> ",PRICE_PER_SQFT=ny_house_data$PRICE_PER_SQFT),
               clusterOptions = markerClusterOptions())
p1
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(viridis)
## Loading required package: viridisLite
# Convert the character strings to spatial objects
zip_codes_with_prices <- st_as_sf(zip_codes_with_prices, wkt = "the_geom")

# Create a color palette based on the average prices
pal <- colorNumeric(palette = "viridis", domain = zip_codes_with_prices$median_price)

# Create a leaflet map centered over a specific location 
p2 <- leaflet(data = zip_codes_with_prices) %>%
  addTiles() %>%
  addPolygons(
    fillColor = ~pal(median_price),  # Color by median_price
    fillOpacity = 0.7,
    color = "black",  # Border color
    weight = 1,  # Border thickness
    popup = ~paste("Zip Code:", MODZCTA, "<br>Median Price:", median_price),  # Add popup info
    highlight = TRUE  # Highlight on hover
  ) %>%
  addLegend(
    pal = pal,
    values = zip_codes_with_prices$median_price,
    title = "Median Price Per Sqft",
    position = "bottomright"  # Legend position
  )

p2
ny_retail_data <- read.csv("retail-food-stores 1.csv")

# Convert relevant columns to numeric
ny_retail_data$Zip.Code <- as.numeric(ny_retail_data$Zip.Code)

#Select a subset of data interested 
ny_retail_data <- ny_retail_data %>%
  select(
    Name = DBA.Name,
    EstablishmentType = Establishment.Type,
    City = City,
    ZipCode = Zip.Code,
    Location = Location
  )

#Mutate the data to etract Latitude and Longtitude
ny_retail_data <- ny_retail_data %>%
  mutate(
    Latitude = as.numeric(str_extract(Location, "(?<='latitude': ')[^']+")) ,
    Longitude = as.numeric(str_extract(Location, "(?<='longitude': ')[^']+")),
    Location = NULL
  )

# For simplicity, remove rows with missing values
ny_retail_data <- ny_retail_data %>% na.omit()
ny_zip_codes <- zip_codes_with_prices %>%
  select(ZipCode=MODZCTA)

ny_retail_data <- ny_retail_data%>%
  filter(ZipCode %in% ny_zip_codes$ZipCode)
p3 <- leaflet(ny_retail_data) %>%
  addTiles() %>%    
  addCircleMarkers(lng = ~ny_retail_data$Longitude, lat = ~ny_retail_data$Latitude,
             popup = ~paste("<b>Name:</b>", ny_retail_data$Name,"<br>",
                            "<b>Establishment Type:</b> ", ny_retail_data$EstablishmentType,"<br>"),
               clusterOptions = markerClusterOptions())
p3
establishments_by_zip <- ny_retail_data %>%
  group_by(ZipCode) %>%
  summarise(
    num_establishments = n_distinct(Name)
  )

zip_codes_with_num <- left_join(zip_codes_with_prices,establishments_by_zip,by=c("MODZCTA"="ZipCode"))

zip_codes_with_num <- zip_codes_with_num %>% na.omit()

str(zip_codes_with_num)
## Classes 'sf' and 'data.frame':   170 obs. of  4 variables:
##  $ MODZCTA           : num  10001 10002 10003 10026 10004 ...
##  $ median_price      : num  570 831 893 595 1036 ...
##  $ num_establishments: int  70 192 77 49 10 10 7 21 76 46 ...
##  $ the_geom          :sfc_MULTIPOLYGON of length 170; first list element: List of 1
##   ..$ :List of 1
##   .. ..$ : num [1:84, 1:2] -74 -74 -74 -74 -74 ...
##   ..- attr(*, "class")= chr [1:3] "XY" "MULTIPOLYGON" "sfg"
##  - attr(*, "sf_column")= chr "the_geom"
##  - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA
##   ..- attr(*, "names")= chr [1:3] "MODZCTA" "median_price" "num_establishments"
##  - attr(*, "na.action")= 'omit' Named int [1:4] 83 135 137 147
##   ..- attr(*, "names")= chr [1:4] "83" "135" "137" "147"
# Convert the character strings to spatial objects
zip_codes_with_num <- st_as_sf(zip_codes_with_num, wkt = "the_geom")

# Create a color palette based on the average prices
pal2 <- colorNumeric(palette = "viridis", domain = zip_codes_with_num$num_establishments)

# Create a leaflet map centered over a specific location 
p4 <- leaflet(data = zip_codes_with_num) %>%
  addTiles() %>%
  addPolygons(
    fillColor = ~pal2(num_establishments),  # Color by median_price
    fillOpacity = 0.7,
    color = "black",  # Border color
    weight = 1,  # Border thickness
    popup = ~paste("Zip Code:", MODZCTA, "<br>Number of Retail Establishment:", num_establishments),  # Add popup info
    highlight = TRUE  # Highlight on hover
  ) %>%
  addLegend(
    pal = pal2,
    values = zip_codes_with_num$num_establishments,
    title = "Number of Retail Establishment",
    position = "bottomright"  
  )

p4
retail_food_stores <- read_csv("retail-food-stores_sub.csv")
## Rows: 2639 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): County, Operation Type, Establishment Type, Entity Name, DBA Name,...
## dbl  (5): License Number, Zip Code, Square Footage, Counties, New York Zip C...
## lgl  (2): Address Line 2, Address Line 3
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(stringr)

# Extract zip code from 'Main_address' column in ny_house_data
ny_house_data <- ny_house_data %>%
  mutate(Main_address_zip = str_extract(MAIN_ADDRESS, "\\d{5}"))

# Convert 'Zip Code' column in retail_food_stores to character
retail_food_stores <- retail_food_stores %>%
  mutate(`Zip Code` = as.character(`Zip Code`))

# Join datasets using 'Zip Code' from retail_food_stores and 'Main_address_zip' from ny_house_data
combined_data <- inner_join(retail_food_stores, ny_house_data, by = c("Zip Code" = "Main_address_zip"))
## Warning in inner_join(retail_food_stores, ny_house_data, by = c(`Zip Code` = "Main_address_zip")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 80 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# Aggregate the data by zip code, housing price, and use the number of rows as a proxy for the count of retail stores
aggregated_data <- combined_data %>%
  group_by(`Zip Code`, PRICE) %>%
  summarise(Retail_Store_Count = n()) %>%
  ungroup()
## `summarise()` has grouped output by 'Zip Code'. You can override using the
## `.groups` argument.
# Create a scatter plot
ggplot(aggregated_data, aes(x = `Zip Code`, y = PRICE, color = Retail_Store_Count)) +
  geom_point() +
  labs(
    title = "Scatter Plot of Neighborhood Housing Price vs. Retail Store Count",
    x = "Zip Code",
    y = "Neighborhood Housing Price",
    color = "Retail Store Count"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)
  )

library(readr)
library(dplyr)  
library(tidyverse)  
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)  
library(plotly)   
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(lubridate)  

Load dataset

ny_house_data <- read_csv("NY-House-Dataset.csv", show_col_types = FALSE)
head(ny_house_data)
## # A tibble: 6 × 17
##   BROKERTITLE   TYPE   PRICE  BEDS  BATH PROPERTYSQFT ADDRESS STATE MAIN_ADDRESS
##   <chr>         <chr>  <dbl> <dbl> <dbl>        <dbl> <chr>   <chr> <chr>       
## 1 Brokered by … Cond… 3.15e5     2  2            1400 2 E 55… New … 2 E 55th St…
## 2 Brokered by … Cond… 1.95e8     7 10           17545 Centra… New … Central Par…
## 3 Brokered by … Hous… 2.6 e5     4  2            2015 620 Si… Stat… 620 Sinclai…
## 4 Brokered by … Cond… 6.90e4     3  1             445 2 E 55… Manh… 2 E 55th St…
## 5 Brokered by … Town… 5.50e7     7  2.37        14175 5 E 64… New … 5 E 64th St…
## 6 Brokered by … Hous… 6.90e5     5  2            4004 584 Pa… Broo… 584 Park Pl…
## # ℹ 8 more variables: ADMINISTRATIVE_AREA_LEVEL_2 <chr>, LOCALITY <chr>,
## #   SUBLOCALITY <chr>, STREET_NAME <chr>, LONG_NAME <chr>,
## #   FORMATTED_ADDRESS <chr>, LATITUDE <dbl>, LONGITUDE <dbl>
# For simplicity, remove rows with missing values
ny_house_data <- ny_house_data %>% na.omit()

# Convert relevant columns to numeric
ny_house_data$price <- as.numeric(ny_house_data$PRICE)
ny_house_data$beds <- as.numeric(ny_house_data$BEDS)
ny_house_data$bath <- as.numeric(ny_house_data$BATH)
ny_house_data$propertysqft <- as.numeric(ny_house_data$PROPERTYSQFT)

# Calculate the first quartile (Q1), third quartile (Q3), and IQR
Q1 <- quantile(ny_house_data$PRICE, 0.25)
Q3 <- quantile(ny_house_data$PRICE, 0.75)
IQR <- Q3 - Q1

# Define the lower and upper bounds for outliers
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR

# Filter out rows with `PRICE` values outside of the bounds
ny_house_data <- ny_house_data %>%
  filter(PRICE >= lower_bound & PRICE <= upper_bound)
# Count the number of occurrences of each property type
property_counts <- ny_house_data %>%
  count(TYPE, sort = TRUE)  # Count and sort by the number of each type

# Create an interactive bar chart
interactive_bar_chart <- plot_ly(data = property_counts, x = ~TYPE, y = ~n, type = 'bar', marker = list(color = 'rgb(158,202,225)', line = list(color = 'rgb(8,48,107)', width = 1.5))) %>%
  layout(title = "Property Type Counts", xaxis = list(title = "Property Type"), yaxis = list(title = "Count"))

# Display the interactive plot
interactive_bar_chart
# Summary statistics for each broker
broker_summary <- ny_house_data %>%
  group_by(BROKERTITLE) %>%
  summarise(
    Average_Price = mean(PRICE, na.rm = TRUE),
    Median_Price = median(PRICE, na.rm = TRUE),
    Min_Price = min(PRICE, na.rm = TRUE),
    Max_Price = max(PRICE, na.rm = TRUE),
    Count = n()
  ) %>%
  arrange(desc(Average_Price))  # Arrange by average price in descending order

# View the summary
print(broker_summary)
## # A tibble: 1,012 × 6
##    BROKERTITLE              Average_Price Median_Price Min_Price Max_Price Count
##    <chr>                            <dbl>        <dbl>     <dbl>     <dbl> <int>
##  1 Brokered by Anita Loewy        2795000      2795000   2795000   2795000     1
##  2 Brokered by Exit Realty…       2780000      2780000   2780000   2780000     1
##  3 Brokered by TREE OF LIF…       2759000      2759000   2759000   2759000     1
##  4 Brokered by Mattia Real…       2749000      2749000   2749000   2749000     1
##  5 Brokered by Anne Lopa R…       2595000      2595000   2595000   2595000     1
##  6 Brokered by Halstead Pr…       2550000      2550000   2550000   2550000     1
##  7 Brokered by Real Estate…       2500000      2500000   2500000   2500000     1
##  8 Brokered by Toscana Pro…       2500000      2500000   2500000   2500000     1
##  9 Brokered by Wolf Proper…       2500000      2500000   2500000   2500000     2
## 10 Brokered by Ny Manageme…       2499000      2499000   2499000   2499000     1
## # ℹ 1,002 more rows
library(dplyr)
library(ggplot2)

# Calculate median prices and rank brokers
broker_ranks <- ny_house_data %>%
  group_by(BROKERTITLE) %>%
  summarise(Median_Price = median(PRICE, na.rm = TRUE), .groups = 'drop') %>%
  arrange(desc(Median_Price))

top_brokers <- head(broker_ranks, 10)$BROKERTITLE
bottom_brokers <- tail(broker_ranks, 10)$BROKERTITLE

# Filter data for top and bottom brokers
top_bottom_data <- ny_house_data %>%
  filter(BROKERTITLE %in% c(top_brokers, bottom_brokers))


library(ggplot2)
library(plotly)

# Creating Scatter Plot
scatter_plot <- ggplot(top_bottom_data, aes(x = PROPERTYSQFT, y = PRICE, color = BROKERTITLE)) +
  geom_point(alpha = 0.6) +
  scale_color_manual(values = rep("blue", 20)) +
  labs(title = "Scatter Plot of House Prices vs. Property Square Footage",
       x = "Property Square Footage (sq ft)",
       y = "House Price") +
  theme_minimal() +
  theme(legend.position = "top", 
        legend.text = element_text(size = 10),
        axis.text.x = element_text(angle = 45, hjust = 1))


ggplotly(scatter_plot)
library(dplyr)
library(ggplot2)

# Calculate median prices for each broker and identify top 10 and bottom 10
broker_medians <- ny_house_data %>%
  group_by(BROKERTITLE) %>%
  summarise(Median_Price = median(PRICE, na.rm = TRUE), .groups = 'drop') %>%
  arrange(Median_Price)

top10_brokers <- head(broker_medians, 10)
bottom10_brokers <- tail(broker_medians, 10)

# Combine top 10 and bottom 10 brokers
selected_brokers <- rbind(top10_brokers, bottom10_brokers)
selected_broker_titles <- selected_brokers$BROKERTITLE

# Filter the original dataset to only include these brokers
filtered_data <- ny_house_data %>%
  filter(BROKERTITLE %in% selected_broker_titles)



# Define price bins
filtered_data$Price_Bin <- cut(filtered_data$PRICE, breaks = seq(min(filtered_data$PRICE), max(filtered_data$PRICE), by = 50000), include.lowest = TRUE)

# Calculate the number of properties within each price range for each broker
price_distribution <- filtered_data %>%
  group_by(BROKERTITLE, Price_Bin) %>%
  summarise(Count = n(), .groups = 'drop')

# Create a heatmap
heatmap_plot <- ggplot(price_distribution, aes(x = BROKERTITLE, y = Price_Bin, fill = Count)) +
  geom_tile() +  # Fill with tiles
  scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Count") +  # Blue gradient
  labs(title = "House Price Distribution Heatmap by Broker", x = "Broker", y = "Price Range") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),  # Rotate x-axis labels for readability
        legend.position = "right")

# Display the heatmap
print(heatmap_plot)

ggsave("Broker_Price_Heatmap.png", heatmap_plot, width = 10, height = 6, dpi = 300)